# load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(brms) # bayesian regression
## Loading required package: Rcpp
## Loading 'brms' package (version 2.22.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
##
## Attaching package: 'brms'
##
## The following object is masked from 'package:stats':
##
## ar
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(tidyverse)
# Read the cleaned flight data set
flight_data <- read.csv("flight_data_cleaned.csv")
# Delete rows containing NA in the response or any predictor used below,
# so every model in this script is fit on exactly the same observations
flight_data <- flight_data %>%
  drop_na(
    Fare, Number.Of.Stops, Total_Minutes, distance,
    IsWeekend, ifHoliday, Is_Low_Cost, Low_Cost_Count,
    Departure.Off.Peak, Arrival.Off.Peak
  )
# Summary statistics of the response, used later to put RMSE in context
mean_fare <- mean(flight_data$Fare)
sd_fare <- sd(flight_data$Fare)
cat(mean_fare, "\n")
## 23769.17
cat(sd_fare, "\n")
## 20344.6
# basic linear regression model
# OLS baseline: Fare regressed on all main effects, no interactions.
# Predictors mix continuous variables (Total_Minutes, distance) with
# 0/1 indicators (IsWeekend, ifHoliday, Is_Low_Cost, off-peak flags).
lm_basic <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak,
data = flight_data)
# summary
# All coefficients are significant; adjusted R^2 ~ 0.46 (see output below)
summary(lm_basic)
##
## Call:
## lm(formula = Fare ~ Number.Of.Stops + Total_Minutes + distance +
## IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak, data = flight_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58312 -8929 -2390 4275 71496
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.610e+03 3.365e+02 19.642 < 2e-16 ***
## Number.Of.Stops 1.101e+03 2.142e+02 5.139 2.79e-07 ***
## Total_Minutes 5.858e+00 2.215e-01 26.454 < 2e-16 ***
## distance 3.415e+00 3.258e-02 104.804 < 2e-16 ***
## IsWeekend 1.351e+03 1.919e+02 7.037 2.01e-12 ***
## ifHoliday -1.657e+03 4.178e+02 -3.967 7.29e-05 ***
## Is_Low_Cost -4.486e+03 8.700e+02 -5.156 2.54e-07 ***
## Low_Cost_Count -1.960e+03 4.291e+02 -4.567 4.96e-06 ***
## Departure.Off.Peak 6.226e+02 1.965e+02 3.168 0.00154 **
## Arrival.Off.Peak 3.167e+03 2.015e+02 15.718 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14930 on 27438 degrees of freedom
## Multiple R-squared: 0.4617, Adjusted R-squared: 0.4615
## F-statistic: 2615 on 9 and 27438 DF, p-value: < 2.2e-16
# basic bayesian regression model
# Same formula as lm_basic, fit with brms defaults: 4 chains x 2000
# iterations (first 1000 of each are warmup) = 4000 post-warmup draws.
# NOTE(review): no `seed =` argument is set, so the posterior draws are not
# exactly reproducible across runs — confirm whether that matters here.
bayes_model <- brm(
Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak,
data = flight_data,
family = gaussian(),
chains = 4,
iter = 2000
)
## Compiling Stan program...
## Start sampling
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 9.6e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 8.876 seconds (Warm-up)
## Chain 1: 2.137 seconds (Sampling)
## Chain 1: 11.013 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 9e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.9 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 11.301 seconds (Warm-up)
## Chain 2: 2.165 seconds (Sampling)
## Chain 2: 13.466 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 8.9e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.89 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 10.328 seconds (Warm-up)
## Chain 3: 2.351 seconds (Sampling)
## Chain 3: 12.679 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 8.9e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.89 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 13.095 seconds (Warm-up)
## Chain 4: 2.302 seconds (Sampling)
## Chain 4: 15.397 seconds (Total)
## Chain 4:
# summary of bayesian regression model and bayesian R squared
# Posterior means closely match the OLS estimates above; all Rhat = 1.00
# in the output below, indicating the chains converged.
summary(bayes_model)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + Arrival.Off.Peak
## Data: flight_data (Number of observations: 27448)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 6606.60 332.55 5949.19 7263.32 1.00 4409 3273
## Number.Of.Stops 1104.37 215.97 672.38 1523.31 1.00 3127 3100
## Total_Minutes 5.86 0.22 5.40 6.29 1.00 3474 2692
## distance 3.41 0.03 3.35 3.48 1.00 3999 2625
## IsWeekend 1349.16 192.54 974.83 1733.95 1.00 4132 2987
## ifHoliday -1659.98 413.90 -2476.49 -854.96 1.00 3939 2939
## Is_Low_Cost -4466.86 851.76 -6107.22 -2782.98 1.00 2302 2529
## Low_Cost_Count -1967.62 420.40 -2800.33 -1142.59 1.00 2307 2770
## Departure.Off.Peak 623.77 191.43 260.79 999.91 1.00 4134 3008
## Arrival.Off.Peak 3168.78 206.44 2763.71 3571.59 1.00 4048 2793
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 14929.46 64.76 14800.94 15056.29 1.00 6467 3043
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Bayesian R^2: posterior distribution of variance explained (~0.462 below)
bayes_R2(bayes_model)
## Estimate Est.Error Q2.5 Q97.5
## R2 0.4616525 0.003250939 0.4552978 0.4679927
# linear regression model with interaction term 1
# Does the weekend effect on Fare differ on holidays?
lm_interaction1 <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak+
IsWeekend:ifHoliday, # weekend and holiday
data = flight_data)
# compare model with interaction1 with basic model
# Partial F-test vs lm_basic; p ~ 0.057 (output below): not significant at 0.05
anova(lm_basic, lm_interaction1)
## Analysis of Variance Table
##
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak + IsWeekend:ifHoliday
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27438 6.1152e+12
## 2 27437 6.1144e+12 1 806583409 3.6194 0.05712 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# linear regression model with interaction term 2
lm_interaction2 <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak+
Is_Low_Cost:distance, # Differences in pricing strategies of low-cost operators across various distances
data = flight_data)
# compare model with interaction2 with basic model
# Partial F-test vs lm_basic; p < 2.2e-16 (output below): highly significant
anova(lm_basic, lm_interaction2)
## Analysis of Variance Table
##
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak + Is_Low_Cost:distance
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27438 6.1152e+12
## 2 27437 6.0416e+12 1 7.3568e+10 334.1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Visualize the pricing differences of low-cost operators across various distances
# Scatter of Fare against distance with a separate OLS fit line for
# low-cost (Is_Low_Cost = 1) and regular (Is_Low_Cost = 0) carriers.
carrier_plot <- ggplot(
  flight_data,
  aes(x = distance, y = Fare, color = factor(Is_Low_Cost))
)
carrier_plot +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(
    title = "The distance-Fare relationship of Low-cost operators vs. regular operators",
    color = "whether low-cost operator or not(0=no, 1=yes)"
  )
## `geom_smooth()` using formula = 'y ~ x'
# linear regression model with interaction term 3
lm_interaction3 <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak+
Total_Minutes:Number.Of.Stops, # an increase in the Number.Of.Stops on Total_Minutes affect Fare
data = flight_data)
# compare model with interaction3 with basic model
# Partial F-test vs lm_basic; p < 2.2e-16 (output below): highly significant
anova(lm_basic, lm_interaction3)
## Analysis of Variance Table
##
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak + Total_Minutes:Number.Of.Stops
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27438 6.1152e+12
## 2 27437 6.0974e+12 1 1.7716e+10 79.717 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analyze the impact of Total_Minutes on Fare under different Number.Of.Stops
# One fitted line per stop count shows how the duration-fare slope
# changes with the number of stops.
stops_plot <- ggplot(
  flight_data,
  aes(x = Total_Minutes, y = Fare, color = factor(Number.Of.Stops))
) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(
    title = "Total_Minutes-Fare relationship under different Number.Of.Stops",
    color = "Number.Of.Stops"
  )
print(stops_plot)
## `geom_smooth()` using formula = 'y ~ x'
# linear regression model with interaction term 4
lm_interaction4 <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak+
Departure.Off.Peak:Arrival.Off.Peak, # The combined effect of whether departure and arrival times are during peak hours
data = flight_data)
# compare model with interaction4 with basic model
# Partial F-test vs lm_basic; p ~ 0.010 (output below): significant at 0.05
anova(lm_basic, lm_interaction4)
## Analysis of Variance Table
##
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak + Departure.Off.Peak:Arrival.Off.Peak
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27438 6.1152e+12
## 2 27437 6.1137e+12 1 1464187478 6.571 0.01037 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# linear regression model with interaction term 5
lm_interaction5 <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak+
Total_Minutes:ifHoliday, # Whether IfHoliday plays as moderator between totalMinutes and Fare
data = flight_data)
# compare model with interaction5 with basic model
# Partial F-test vs lm_basic; p ~ 0.18 (output below): not significant
anova(lm_basic, lm_interaction5)
## Analysis of Variance Table
##
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak + Total_Minutes:ifHoliday
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27438 6.1152e+12
## 2 27437 6.1148e+12 1 407926201 1.8304 0.1761
# linear regression model with interaction term 6
lm_interaction6 <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak+
Total_Minutes:IsWeekend, # Whether IsWeekend plays as moderator between totalMinutes and Fare
data = flight_data)
# compare model with interaction6 with basic model
# Partial F-test vs lm_basic; p ~ 0.29 (output below): not significant
anova(lm_basic, lm_interaction6)
## Analysis of Variance Table
##
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak + Total_Minutes:IsWeekend
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27438 6.1152e+12
## 2 27437 6.1149e+12 1 244793203 1.0984 0.2946
# linear regression model with interaction term 7
lm_interaction7 <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak+
Departure.Off.Peak:Number.Of.Stops, # passengers may tolerate more stops during day flights compared to night flights
data = flight_data)
# compare model with interaction7 with basic model
# Partial F-test vs lm_basic; p ~ 0.43 (output below): not significant
anova(lm_basic, lm_interaction7)
## Analysis of Variance Table
##
## Model 1: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak
## Model 2: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend +
## ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak + Departure.Off.Peak:Number.Of.Stops
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27438 6.1152e+12
## 2 27437 6.1150e+12 1 138755764 0.6226 0.4301
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
# Define model names (one per interaction model tested above)
model_names <- c("IsWeekend:ifHoliday", "Is_Low_Cost:distance", "Total_Minutes:Number.Of.Stops",
                 "Departure.Off.Peak:Arrival.Off.Peak", "Total_Minutes:ifHoliday", "Total_Minutes:IsWeekend",
                 "Departure.Off.Peak:Number.Of.Stops")
# (P-value, Sum of Sq) pairs transcribed by hand from the 7 ANOVA
# comparisons printed above.
# NOTE(review): if the data or models change, regenerate these values from
# the anova() objects instead of editing them manually.
anova_list <- list(c("0.05712 .", 806583409), c("2.2e-16 ***", "7.3568e+10"),
                   c("2.2e-16 ***", "1.7716e+10"), c("0.01037 *", 1464187478),
                   c("0.1761", 407926201), c("0.2946", 244793203), c("0.4301", 138755764))
# Extract Sum of Squares and P-value, then create a data frame
anova_summary <- data.frame(
  Model = model_names,
  P_value = sapply(anova_list, function(x) x[1]),
  Sum_of_Sq = sapply(anova_list, function(x) x[2])
)
# Create the table
table_grob <- tableGrob(anova_summary, rows = NULL)
# Style the table. The first ncol(table_grob) grobs are treated as the
# header cells (as in the original code), the rest as body cells.
# BUG FIX: the original styled the header first and then re-assigned
# gpar(fontsize = 12) to *every* grob, clobbering the header styling it
# had just applied. Style each grob exactly once instead.
header_bg <- gpar(fill = "#d6d6d6", col = "black", fontsize = 13, fontface = "bold")
n_header <- ncol(table_grob)
for (i in seq_along(table_grob$grobs)) {
  if (i <= n_header) {
    table_grob$grobs[[i]]$gp <- header_bg
  } else {
    table_grob$grobs[[i]]$gp <- gpar(fontsize = 12)
  }
}
# Save high-resolution image
png("anova_table_academic.png", width = 12, height = 4, units = "in", res = 400)
grid.draw(table_grob)
dev.off()
## quartz_off_screen
## 2
# Display the table
grid.draw(table_grob)
# construct a comprehensive linear regression model including important interaction terms
# Keeps the three interactions whose partial F-tests were significant
# (interactions 2, 3 and 4 above); interactions 1, 5, 6 and 7 are dropped.
lm_important_interactions <- lm(Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak +
Is_Low_Cost:distance + # interaction2
Total_Minutes:Number.Of.Stops + # interaction3
Departure.Off.Peak:Arrival.Off.Peak, # interaction4
data = flight_data)
# summary
# Adjusted R^2 rises to ~0.471 (from ~0.462 for lm_basic; see output below)
summary(lm_important_interactions)
##
## Call:
## lm(formula = Fare ~ Number.Of.Stops + Total_Minutes + distance +
## IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak +
## Arrival.Off.Peak + Is_Low_Cost:distance + Total_Minutes:Number.Of.Stops +
## Departure.Off.Peak:Arrival.Off.Peak, data = flight_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61972 -8648 -2540 4088 73270
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.200e+02 5.874e+02 -0.375 0.707977
## Number.Of.Stops 5.511e+03 4.231e+02 13.025 < 2e-16 ***
## Total_Minutes 1.122e+01 5.029e-01 22.306 < 2e-16 ***
## distance 3.589e+00 3.373e-02 106.419 < 2e-16 ***
## IsWeekend 1.356e+03 1.903e+02 7.128 1.04e-12 ***
## ifHoliday -1.622e+03 4.144e+02 -3.915 9.07e-05 ***
## Is_Low_Cost 6.657e+03 1.081e+03 6.159 7.41e-10 ***
## Low_Cost_Count -5.092e+03 5.037e+02 -10.109 < 2e-16 ***
## Departure.Off.Peak 1.241e+03 2.327e+02 5.331 9.82e-08 ***
## Arrival.Off.Peak 3.637e+03 2.362e+02 15.396 < 2e-16 ***
## distance:Is_Low_Cost -2.049e+00 1.044e-01 -19.621 < 2e-16 ***
## Number.Of.Stops:Total_Minutes -3.642e+00 3.219e-01 -11.314 < 2e-16 ***
## Departure.Off.Peak:Arrival.Off.Peak -1.477e+03 4.320e+02 -3.419 0.000628 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14800 on 27435 degrees of freedom
## Multiple R-squared: 0.4708, Adjusted R-squared: 0.4706
## F-statistic: 2034 on 12 and 27435 DF, p-value: < 2.2e-16
# construct a comprehensive bayesian regression model including important interaction terms
# Bayesian counterpart of lm_important_interactions, again with brms
# defaults: 4 chains x 2000 iterations (1000 warmup each).
# NOTE(review): as with bayes_model, no `seed =` is set, so draws are not
# exactly reproducible across runs.
bayes_important_interactions <- brm(
Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak +
Is_Low_Cost:distance +
Total_Minutes:Number.Of.Stops +
Departure.Off.Peak:Arrival.Off.Peak,
data = flight_data,
family = gaussian(),
chains = 4,
iter = 2000
)
## Compiling Stan program...
## Start sampling
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.000109 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.09 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 14.614 seconds (Warm-up)
## Chain 1: 3.263 seconds (Sampling)
## Chain 1: 17.877 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 9.6e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 9.671 seconds (Warm-up)
## Chain 2: 3.189 seconds (Sampling)
## Chain 2: 12.86 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 9.5e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.95 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 11.307 seconds (Warm-up)
## Chain 3: 3.187 seconds (Sampling)
## Chain 3: 14.494 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 9.6e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 16.213 seconds (Warm-up)
## Chain 4: 3.079 seconds (Sampling)
## Chain 4: 19.292 seconds (Total)
## Chain 4:
# summary and bayesian R squared
# All Rhat = 1.00 in the output below; posterior means again track the
# OLS estimates from lm_important_interactions.
summary(bayes_important_interactions)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: Fare ~ Number.Of.Stops + Total_Minutes + distance + IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count + Departure.Off.Peak + Arrival.Off.Peak + Is_Low_Cost:distance + Total_Minutes:Number.Of.Stops + Departure.Off.Peak:Arrival.Off.Peak
## Data: flight_data (Number of observations: 27448)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat
## Intercept -200.99 583.17 -1334.40 968.32 1.00
## Number.Of.Stops 5501.72 413.03 4692.94 6298.47 1.00
## Total_Minutes 11.20 0.50 10.23 12.18 1.00
## distance 3.59 0.03 3.52 3.66 1.00
## IsWeekend 1357.90 192.93 981.92 1740.77 1.00
## ifHoliday -1628.18 407.77 -2419.94 -823.82 1.00
## Is_Low_Cost 6623.99 1075.15 4471.19 8698.11 1.00
## Low_Cost_Count -5079.30 501.16 -6019.01 -4067.60 1.00
## Departure.Off.Peak 1244.03 230.99 786.98 1703.54 1.00
## Arrival.Off.Peak 3641.36 235.73 3184.41 4095.07 1.00
## distance:Is_Low_Cost -2.05 0.11 -2.25 -1.84 1.00
## Number.Of.Stops:Total_Minutes -3.63 0.31 -4.24 -3.01 1.00
## Departure.Off.Peak:Arrival.Off.Peak -1480.79 430.43 -2311.05 -603.31 1.00
## Bulk_ESS Tail_ESS
## Intercept 1551 2484
## Number.Of.Stops 1646 2347
## Total_Minutes 1819 2726
## distance 4758 2706
## IsWeekend 4669 2823
## ifHoliday 4722 3090
## Is_Low_Cost 1536 2506
## Low_Cost_Count 1694 2625
## Departure.Off.Peak 3741 3036
## Arrival.Off.Peak 3689 2947
## distance:Is_Low_Cost 4322 3005
## Number.Of.Stops:Total_Minutes 1533 2336
## Departure.Off.Peak:Arrival.Off.Peak 3090 2629
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 14803.63 63.98 14677.69 14936.09 1.00 7845 2295
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Bayesian R^2 of the interaction model (~0.471 below vs ~0.462 for the basic model)
bayes_R2(bayes_important_interactions)
## Estimate Est.Error Q2.5 Q97.5
## R2 0.4707822 0.003218736 0.4644047 0.4769575
# reproducibility
set.seed(123)
# define the model formula
# Same specification as lm_important_interactions, stored once so the
# train() call stays readable.
model_formula <- Fare ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak +
Is_Low_Cost:distance +
Total_Minutes:Number.Of.Stops +
Departure.Off.Peak:Arrival.Off.Peak
# define 10 fold cv
train_control <- trainControl(
method = "cv",
number = 10, # 10-fold CV
verboseIter = TRUE
)
# cv
# caret refits the OLS model on each of 10 training folds and evaluates
# RMSE / R^2 / MAE on the corresponding held-out fold.
cv_model <- train(
model_formula,
data = flight_data,
method = "lm",
trControl = train_control
)
## + Fold01: intercept=TRUE
## - Fold01: intercept=TRUE
## + Fold02: intercept=TRUE
## - Fold02: intercept=TRUE
## + Fold03: intercept=TRUE
## - Fold03: intercept=TRUE
## + Fold04: intercept=TRUE
## - Fold04: intercept=TRUE
## + Fold05: intercept=TRUE
## - Fold05: intercept=TRUE
## + Fold06: intercept=TRUE
## - Fold06: intercept=TRUE
## + Fold07: intercept=TRUE
## - Fold07: intercept=TRUE
## + Fold08: intercept=TRUE
## - Fold08: intercept=TRUE
## + Fold09: intercept=TRUE
## - Fold09: intercept=TRUE
## + Fold10: intercept=TRUE
## - Fold10: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# print the result
# Cross-validated RMSE ~ 14805 and R^2 ~ 0.471 (output below), close to the
# in-sample fit of lm_important_interactions — little sign of overfitting.
print(cv_model)
## Linear Regression
##
## 27448 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 24703, 24704, 24702, 24704, 24704, 24704, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 14804.66 0.4705339 10211.12
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# get more detailed prediction metrics
# Includes the SD of each metric across the 10 folds
print(cv_model$results)
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 14804.66 0.4705339 10211.12 298.8753 0.01832668 192.2191
# visualize the prediction results
# use the final model to predict
# Fitted-vs-actual plot for the final CV model (refit on the full data);
# the dashed red identity line marks perfect prediction.
predictions <- predict(cv_model, flight_data)
pred_vs_actual <- data.frame(
  predicted = predictions,
  actual = flight_data$Fare
)
ggplot(pred_vs_actual, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "prediction values vs actual values",
    x = "actual fare",
    y = "prediction fare"
  )
# Per-fold resampling metrics (RMSE / R^2 / MAE for each of the 10 folds)
cv_metrics <- cv_model$resample
print(cv_metrics)
## RMSE Rsquared MAE Resample
## 1 14714.45 0.4753627 10216.176 Fold01
## 2 15017.53 0.4747368 10161.424 Fold02
## 3 14798.81 0.4785571 10081.270 Fold03
## 4 15112.10 0.4369495 10516.230 Fold04
## 5 14789.66 0.4785613 10258.759 Fold05
## 6 14809.83 0.4486708 10314.884 Fold06
## 7 14703.75 0.4881551 10044.551 Fold07
## 8 15153.71 0.4611778 10388.540 Fold08
## 9 14863.40 0.4641939 10293.771 Fold09
## 10 14083.40 0.4989744 9835.627 Fold10
# calculate Mean_RMSE/SD_RMSE/Mean_Rsquared/SD_RSquared
# Aggregate the per-fold CV metrics into their mean and standard deviation.
fold_rmse <- cv_metrics$RMSE
fold_rsq <- cv_metrics$Rsquared
summary_metrics <- data.frame(
  Mean_RMSE = mean(fold_rmse),
  SD_RMSE = sd(fold_rmse),
  Mean_Rsquared = mean(fold_rsq),
  SD_Rsquared = sd(fold_rsq)
)
print(summary_metrics)
## Mean_RMSE SD_RMSE Mean_Rsquared SD_Rsquared
## 1 14804.66 298.8753 0.4705339 0.01832668
# Why scale?
# Standardizing Fare (z-score) puts the RMSE in units of standard deviations
# of the fare, which makes model error easier to interpret and compare.
# (Note: scaling does not bound RMSE to 0-1; an RMSE below 1 means the model
# beats a predict-the-mean baseline.)
# check the statistic values of "Fare"
# (re-computed here so this section is self-contained)
mean_fare <- mean(flight_data$Fare)
sd_fare <- sd(flight_data$Fare)
cat("Fare's mean:", mean_fare, "\n")
## Fare's mean: 23769.17
cat("Fare's standard deviation:", sd_fare, "\n")
## Fare's standard deviation: 20344.6
# create data after scaling
# as.numeric() strips the 1-column-matrix attributes that scale() returns;
# without it the data frame gains a matrix column, which can confuse
# downstream modelling and plotting code.
flight_data_scaled <- flight_data %>%
  mutate(Fare_scaled = as.numeric(scale(Fare)))
# use scaled data for cv
# Same seed as the unscaled CV so the fold assignments are comparable
set.seed(123)
train_control <- trainControl(
method = "cv",
number = 10,
verboseIter = TRUE
)
# use scaled data to create model
# Identical specification to cv_model, but with the standardized response;
# the resulting RMSE is in units of fare standard deviations (~0.73 below).
cv_model_scaled <- train(
Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak +
Is_Low_Cost:distance +
Total_Minutes:Number.Of.Stops +
Departure.Off.Peak:Arrival.Off.Peak,
data = flight_data_scaled,
method = "lm",
trControl = train_control
)
## + Fold01: intercept=TRUE
## - Fold01: intercept=TRUE
## + Fold02: intercept=TRUE
## - Fold02: intercept=TRUE
## + Fold03: intercept=TRUE
## - Fold03: intercept=TRUE
## + Fold04: intercept=TRUE
## - Fold04: intercept=TRUE
## + Fold05: intercept=TRUE
## - Fold05: intercept=TRUE
## + Fold06: intercept=TRUE
## - Fold06: intercept=TRUE
## + Fold07: intercept=TRUE
## - Fold07: intercept=TRUE
## + Fold08: intercept=TRUE
## - Fold08: intercept=TRUE
## + Fold09: intercept=TRUE
## - Fold09: intercept=TRUE
## + Fold10: intercept=TRUE
## - Fold10: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# CV results on the standardized response (RMSE ~ 0.73 SD, R^2 ~ 0.471 below)
print(cv_model_scaled)
## Linear Regression
##
## 27448 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 24703, 24704, 24702, 24704, 24704, 24703, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.7276648 0.4705238 0.5018736
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Scatter the in-sample predictions against the observed (scaled) fares;
# the dashed red y = x line marks perfect prediction.
predictions_scaled <- predict(cv_model_scaled, flight_data_scaled)
pred_vs_actual <- data.frame(predicted = predictions_scaled,
                             actual = flight_data_scaled$Fare_scaled)
ggplot(pred_vs_actual, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "predictive fare VS actual fare after scaling",
       x = "actual fare - scaled",
       y = "predictive fare - scaled")
# Bayesian Model Cross Validation
# Set random seed
set.seed(123)
# Create 10-fold indices
folds <- createFolds(flight_data_scaled$Fare_scaled, k = 10, list = TRUE)
# Store prediction results for each fold
# NOTE(review): cv_results_bayes is allocated but never filled inside the
# loop below — it looks like dead code; confirm before removing.
cv_results_bayes <- list()
rmse_values <- numeric(10)
r2_values <- numeric(10)
# Perform cross validation for each fold. This refits the brms model 10
# times, so this chunk is by far the slowest in the document.
for(i in 1:10) {
# Split training and test sets
test_indices <- folds[[i]]
train_data <- flight_data_scaled[-test_indices, ]
test_data <- flight_data_scaled[test_indices, ]
# Train Bayesian model: same fixed-effect formula (with the three pairwise
# interactions) as the caret linear model above; Gaussian likelihood,
# 4 chains x 2000 iterations on 4 cores, silent = 2 to suppress sampler
# progress messages (warnings still surface, see below).
bayes_cv_model <- brm(
Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak +
Is_Low_Cost:distance +
Total_Minutes:Number.Of.Stops +
Departure.Off.Peak:Arrival.Off.Peak,
data = train_data,
family = gaussian(),
chains = 4,
iter = 2000,
cores = 4,
silent = 2
)
# Predict test set (posterior predictive summary; "Estimate" column holds
# the posterior mean prediction)
predictions <- predict(bayes_cv_model, newdata = test_data)
# Calculate RMSE on the held-out fold
rmse_values[i] <- sqrt(mean((predictions[,"Estimate"] - test_data$Fare_scaled)^2))
# Calculate R²
# NOTE(review): bayes_R2() without newdata is evaluated on the training
# fold, so r2_values stores in-sample R², not held-out R² — confirm
# whether bayes_R2(bayes_cv_model, newdata = test_data) was intended.
r2_values[i] <- bayes_R2(bayes_cv_model)[,"Estimate"]
}
## Warning: There were 3364 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3314 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.38, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3295 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.21, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 2879 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: There were 2688 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3094 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.21, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3456 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.45, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3428 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.08, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 2541 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
## Warning: There were 3124 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.18, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
# Calculate average metrics (mean and spread across the 10 folds)
bayes_cv_metrics <- data.frame(
Mean_RMSE = mean(rmse_values),
SD_RMSE = sd(rmse_values),
Mean_Rsquared = mean(r2_values),
SD_Rsquared = sd(r2_values)
)
# Print results
print("Bayesian Model Cross Validation Results:")
# NOTE(review): the two lines below are stray Stan sampler output captured
# by the render, not the result of this print() call.
## 188.683 seconds (Total)
## Chain 4:
print(bayes_cv_metrics)
# Put the linear-model CV summary and the Bayesian CV summary side by side.
lm_cv_rmse <- mean(cv_model_scaled$resample$RMSE)
lm_cv_r2 <- mean(cv_model_scaled$resample$Rsquared)
cv_comparison <- data.frame(
  Model = c("Linear Model CV", "Bayesian Model CV"),
  RMSE = c(lm_cv_rmse, bayes_cv_metrics$Mean_RMSE),
  Rsquared = c(lm_cv_r2, bayes_cv_metrics$Mean_Rsquared)
)
print("Model Comparison:")
## [1] "Model Comparison:"
print(cv_comparison)
## Model RMSE Rsquared
## 1 Linear Model CV 0.7276648 0.4705238
## 2 Bayesian Model CV 0.7276500 0.4708232
# Per-fold RMSE distributions of both models on one boxplot.
rmse_by_model <- list(
  "Linear Model" = cv_model_scaled$resample$RMSE,
  "Bayesian Model" = rmse_values
)
boxplot(rmse_by_model, main = "RMSE Comparison", ylab = "RMSE")
library(gridExtra)
library(grid)
# Row labels for the coefficient table: beta_1..beta_9 (main effects),
# gamma_1..gamma_3 (interaction terms), beta_0 (intercept), plus CV R².
row_labels <- c(expression(beta[1]), expression(beta[2]), expression(beta[3]),
expression(beta[4]), expression(beta[5]), expression(beta[6]),
expression(beta[7]), expression(beta[8]), expression(beta[9]),
expression(gamma[1]), expression(gamma[2]), expression(gamma[3]),
expression(beta[0]), "CV R²")
col_labels <- c("Linear Regression Model", "Bayesian Regression Model")
# Hard-coded estimates copied from the two fitted models: one row per
# coefficient, linear-model value first, Bayesian value second.
data <- matrix(c(
"5.511e+03", "5511.41",
"1.122e+01", "11.22",
"3.589e+00", "3.59",
"1.356e+03", "1359.77",
"-1.622e+03", "-1630.33",
"6.657e+03", "6652.42",
"-5.092e+03", "-5091.12",
"1.241e+03", "1242.18",
"3.637e+03", "3637.24",
"-2.049e+00", "-2.05",
"-3.642e+00", "-3.64",
"-1.477e+03", "-1482.39",
"-2.200e+02", "-220.30",
"0.4705238", "0.4709003"
), ncol = 2, byrow = TRUE)
df <- as.data.frame(data, stringsAsFactors = FALSE)
colnames(df) <- col_labels
# NOTE(review): assigning expressions as rownames coerces them to strings,
# and grid.table() below passes `rows = row_labels` directly anyway, so this
# assignment has no visible effect on the rendered table — confirm before
# removing.
rownames(df) <- sapply(row_labels, function(x) as.expression(x))
# Render the table to a PNG file and close the graphics device.
png("academic_table_truth.png", width = 1000, height = 700, res = 150)
grid.table(df, rows = row_labels, theme = ttheme_default(
core = list(fg_params = list(cex = 1.0)),
colhead = list(fg_params = list(fontface = "bold", cex = 1.1)),
rowhead = list(fg_params = list(fontface = "bold", cex = 1.1))
))
dev.off()
## quartz_off_screen
## 2
Why improve?
The cross-validated RMSE (about 0.73 standard deviations) and R² of roughly 0.47 show the linear model explains less than half of the fare variation, so we try richer model specifications below.
# set seed and cv parameters
# Re-seeds the RNG and rebuilds the same 10-fold CV settings used earlier,
# so the improvement experiments below are evaluated under one shared
# trainControl configuration.
set.seed(123)
train_control <- trainControl(
method = "cv",
number = 10,
verboseIter = TRUE
)
# solution 1: add non linear terms
# Same baseline formula plus quadratic terms for duration and distance;
# the terms are wrapped in I() so ^2 is treated as arithmetic squaring
# rather than formula interaction syntax.
model_nonlinear <- train(
Fare_scaled ~ Number.Of.Stops + Total_Minutes + I(Total_Minutes^2) +
distance + I(distance^2) + # add squared terms
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak +
Is_Low_Cost:distance +
Total_Minutes:Number.Of.Stops +
Departure.Off.Peak:Arrival.Off.Peak,
data = flight_data_scaled,
method = "lm",
trControl = train_control
)
## + Fold01: intercept=TRUE
## - Fold01: intercept=TRUE
## + Fold02: intercept=TRUE
## - Fold02: intercept=TRUE
## + Fold03: intercept=TRUE
## - Fold03: intercept=TRUE
## + Fold04: intercept=TRUE
## - Fold04: intercept=TRUE
## + Fold05: intercept=TRUE
## - Fold05: intercept=TRUE
## + Fold06: intercept=TRUE
## - Fold06: intercept=TRUE
## + Fold07: intercept=TRUE
## - Fold07: intercept=TRUE
## + Fold08: intercept=TRUE
## - Fold08: intercept=TRUE
## + Fold09: intercept=TRUE
## - Fold09: intercept=TRUE
## + Fold10: intercept=TRUE
## - Fold10: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# Cross-validated performance of the quadratic model
print("Results of non-linear model: ")
## [1] "Results of non-linear model: "
print(model_nonlinear$results)
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.6874839 0.52742 0.4724112 0.01164206 0.01275202 0.005514287
# Solution 2: Random Forest
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest on the raw predictors (no explicit interaction terms —
# trees can capture non-linearities and interactions themselves);
# ntree and importance are forwarded by caret to randomForest().
model_rf <- train(
Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak,
data = flight_data_scaled,
method = "rf",
trControl = train_control,
ntree = 500,
importance = TRUE
)
## + Fold01: mtry=2
## - Fold01: mtry=2
## + Fold01: mtry=5
## - Fold01: mtry=5
## + Fold01: mtry=9
## - Fold01: mtry=9
## + Fold02: mtry=2
## - Fold02: mtry=2
## + Fold02: mtry=5
## - Fold02: mtry=5
## + Fold02: mtry=9
## - Fold02: mtry=9
## + Fold03: mtry=2
## - Fold03: mtry=2
## + Fold03: mtry=5
## - Fold03: mtry=5
## + Fold03: mtry=9
## - Fold03: mtry=9
## + Fold04: mtry=2
## - Fold04: mtry=2
## + Fold04: mtry=5
## - Fold04: mtry=5
## + Fold04: mtry=9
## - Fold04: mtry=9
## + Fold05: mtry=2
## - Fold05: mtry=2
## + Fold05: mtry=5
## - Fold05: mtry=5
## + Fold05: mtry=9
## - Fold05: mtry=9
## + Fold06: mtry=2
## - Fold06: mtry=2
## + Fold06: mtry=5
## - Fold06: mtry=5
## + Fold06: mtry=9
## - Fold06: mtry=9
## + Fold07: mtry=2
## - Fold07: mtry=2
## + Fold07: mtry=5
## - Fold07: mtry=5
## + Fold07: mtry=9
## - Fold07: mtry=9
## + Fold08: mtry=2
## - Fold08: mtry=2
## + Fold08: mtry=5
## - Fold08: mtry=5
## + Fold08: mtry=9
## - Fold08: mtry=9
## + Fold09: mtry=2
## - Fold09: mtry=2
## + Fold09: mtry=5
## - Fold09: mtry=5
## + Fold09: mtry=9
## - Fold09: mtry=9
## + Fold10: mtry=2
## - Fold10: mtry=2
## + Fold10: mtry=5
## - Fold10: mtry=5
## + Fold10: mtry=9
## - Fold10: mtry=9
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 9 on full training set
# Random-forest CV results: caret tried mtry = 2, 5, 9 and kept mtry = 9
print("Results of random forest: ")
## [1] "Results of random forest: "
print(model_rf$results)
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 2 0.6445977 0.6011966 0.4479724 0.009084500 0.01176618 0.007406143
## 2 5 0.5471292 0.7007798 0.3576632 0.008795583 0.01069441 0.006313989
## 3 9 0.5338309 0.7152616 0.3447460 0.009309267 0.01041033 0.006670816
# check feature importance (varImp rescales to 0-100; distance dominates)
importance_rf <- varImp(model_rf)
print("feature importance")
## [1] "feature importance"
print(importance_rf)
## rf variable importance
##
## Overall
## distance 100.000
## Total_Minutes 34.386
## Number.Of.Stops 28.597
## Arrival.Off.Peak 20.352
## Departure.Off.Peak 19.863
## Low_Cost_Count 15.483
## Is_Low_Cost 6.500
## IsWeekend 5.392
## ifHoliday 0.000
# Solution 3: Feature Engineering
flight_data_engineered <- flight_data_scaled %>%
mutate(
# create new features
# peak_hours: 1 when either the departure or arrival is NOT off-peak
peak_hours = ifelse(Departure.Off.Peak == 0 | Arrival.Off.Peak == 0, 1, 0),
# NOTE(review): the two ratio features divide by Total_Minutes and by
# distance — rows where either is 0 would produce Inf/NaN; confirm the
# cleaned data contains no zero values for these columns.
distance_per_minute = distance/Total_Minutes,
total_cost_factor = Is_Low_Cost * Low_Cost_Count,
weekend_holiday = IsWeekend * ifHoliday,
stops_per_distance = Number.Of.Stops/distance
)
# Linear model using both the raw predictors and the engineered features,
# evaluated with the shared 10-fold CV settings.
model_engineered <- train(
Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak +
peak_hours + distance_per_minute + total_cost_factor +
weekend_holiday + stops_per_distance,
data = flight_data_engineered,
method = "lm",
trControl = train_control
)
## + Fold01: intercept=TRUE
## - Fold01: intercept=TRUE
## + Fold02: intercept=TRUE
## - Fold02: intercept=TRUE
## + Fold03: intercept=TRUE
## - Fold03: intercept=TRUE
## + Fold04: intercept=TRUE
## - Fold04: intercept=TRUE
## + Fold05: intercept=TRUE
## - Fold05: intercept=TRUE
## + Fold06: intercept=TRUE
## - Fold06: intercept=TRUE
## + Fold07: intercept=TRUE
## - Fold07: intercept=TRUE
## + Fold08: intercept=TRUE
## - Fold08: intercept=TRUE
## + Fold09: intercept=TRUE
## - Fold09: intercept=TRUE
## + Fold10: intercept=TRUE
## - Fold10: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# CV performance with engineered features (only marginally better than the
# 0.7277 RMSE of the baseline interaction model)
print("Results of feature engineering: ")
## [1] "Results of feature engineering: "
print(model_engineered$results)
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.7246163 0.4749353 0.5071564 0.01426998 0.01644719 0.006927153
# Solution 4: Hierarchical Bayesian Model
# create hierarchical Bayesian Model
# NOTE(review): (distance | Is_Low_Cost) uses the 0/1 indicator
# Is_Low_Cost as the grouping factor, i.e. only two groups; random
# effects over so few levels are weakly identified, which is consistent
# with the divergent transitions and the very large R-hat reported in the
# output below — consider relying on the fixed Is_Low_Cost:distance
# interaction (already in the formula) or setting stronger priors.
bayes_model <- brm(
Fare_scaled ~ Number.Of.Stops + Total_Minutes + distance +
IsWeekend + ifHoliday + Is_Low_Cost + Low_Cost_Count +
Departure.Off.Peak + Arrival.Off.Peak +
Is_Low_Cost:distance +
Total_Minutes:Number.Of.Stops +
Departure.Off.Peak:Arrival.Off.Peak +
# Modify the hierarchical structure to retain only one random effect
(distance | Is_Low_Cost), # Allow different distance effects for different airline types
data = flight_data_scaled,
family = gaussian(),
chains = 4,
iter = 2000,
cores = 4
)
## Compiling Stan program...
## Start sampling
## Warning: There were 362 divergent transitions after warmup. See
## https://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them.
## Warning: There were 3608 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: There were 3 chains where the estimated Bayesian Fraction of Missing Information was low. See
## https://mc-stan.org/misc/warnings.html#bfmi-low
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 4.31, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
# Posterior summary; brms itself warns that this fit has not converged,
# so the estimates below should be treated with caution.
summary(bayes_model)
## Warning: Parts of the model have not converged (some Rhats are > 1.05). Be
## careful when analysing the results! We recommend running more iterations and/or
## setting stronger priors.
## Warning: There were 362 divergent transitions after warmup. Increasing
## adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
# Posterior-mean predictions over the full (scaled) data set
predictions_bayes <- predict(bayes_model, flight_data_scaled)[,"Estimate"]
# Calculate Bayes R2 (computed on the fitting data, i.e. in-sample)
bayes_r2 <- bayes_R2(bayes_model)
print("bayes R²: ")
print(bayes_r2)
# NOTE(review): the rendered output line below looks garbled/truncated —
# re-knit to confirm the actual bayes_R2 table.
## 1411095 0.4637851 0.5118908
# compare the performance of the four candidate models
# NOTE(review): the first three RMSE/R² values are 10-fold CV estimates,
# while the hierarchical Bayesian row uses in-sample RMSE on the full data
# and bayes_R2 — the table mixes held-out and in-sample metrics, which
# favors the Bayesian model; confirm whether a fold-based estimate was
# intended for it as well.
results_comparison <- data.frame(
Model = c("non-linear model", "random forest", "feature engineering","hierarchical bayesian model"),
RMSE = c(
min(model_nonlinear$results$RMSE),
min(model_rf$results$RMSE),
min(model_engineered$results$RMSE),
sqrt(mean((predictions_bayes - flight_data_scaled$Fare_scaled)^2))
),
Rsquared = c(
max(model_nonlinear$results$Rsquared),
max(model_rf$results$Rsquared),
max(model_engineered$results$Rsquared),
mean(bayes_r2[,1])
)
)
print("model comparison: ")
## [1] "model comparison: "
print(results_comparison)
## Model RMSE Rsquared
## 1 non-linear model 0.6874839 0.5274200
## 2 random forest 0.5338309 0.7152616
## 3 feature engineering 0.7246163 0.4749353
## 4 hierarchical bayesian model 0.7368088 0.4779538
# Visual comparison: actual-vs-predicted scatter for each candidate model.
# First generate in-sample predictions from every fitted model.
predictions_nonlinear <- predict(model_nonlinear, flight_data_scaled)
predictions_rf <- predict(model_rf, flight_data_scaled)
predictions_engineered <- predict(model_engineered, flight_data_engineered)
predictions_bayes <- predict(bayes_model, flight_data_scaled)[,"Estimate"]
library(ggplot2)
library(gridExtra)
# Helper: scatter of predictions against the observed scaled fares,
# with a dashed red y = x reference line marking perfect prediction.
pred_plot <- function(predicted, plot_title) {
  ggplot(data.frame(actual = flight_data_scaled$Fare_scaled,
                    predicted = predicted),
         aes(x = actual, y = predicted)) +
    geom_point(alpha = 0.5) +
    geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
    theme_minimal() +
    labs(title = plot_title)
}
p1 <- pred_plot(predictions_nonlinear, "performance of non-linear model")
p2 <- pred_plot(predictions_rf, "performance of random forest")
p3 <- pred_plot(predictions_engineered, "performance of feature engineering")
p4 <- pred_plot(predictions_bayes,
                "Hierarchical Bayesian model prediction performance")
# Arrange the four panels in a 2x2 grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)